import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\Original.csv"
df = pd.read_csv(data_path)
df.head()  # NOTE: no visible effect outside a notebook cell
# 'EVI' (Enhanced Vegetation Index) is the dependent variable; all other columns are predictors
X = df.drop('EVI', axis=1) # Independent variables
y = df['EVI'] # Dependent variable
# Standardize the features (zero mean, unit variance) so the regularized linear models and SVR see comparable scales
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)
# Fit all seven regressors on the standardized training split.
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
# Lasso Regression -- max_iter raised from the default 1000: coordinate descent
# emitted ConvergenceWarning ("Objective did not converge") on this data.
lasso_model = Lasso(alpha=0.01, max_iter=10000)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
# Elastic Net Regression -- same convergence fix as Lasso.
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)
# Support Vector Regression (linear kernel)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
# Decision Tree Regression (unseeded: results vary between runs)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
# Random Forest Regression (unseeded: results vary between runs)
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print the mean squared error and R-squared of a model's predictions.

    `model` is accepted for interface compatibility but is not used; the
    metrics are computed directly from `y_test` vs `y_pred`.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

# Report hold-out metrics for every fitted model (same output as seven
# copy-pasted print/evaluate pairs, collapsed into one loop).
for _label, _mdl, _preds in [
    ('Linear Regression', linear_model, y_pred_linear),
    ('Lasso Regression', lasso_model, y_pred_lasso),
    ('Ridge Regression', ridge_model, y_pred_ridge),
    ('Elastic Net Regression', elastic_net_model, y_pred_elastic_net),
    ('Support Vector Regression', svr_model, y_pred_svr),
    ('Decision Tree Regression', tree_model, y_pred_tree),
    ('Random Forest Regression', rf_model, y_pred_rf),
]:
    print(f"\n{_label}:")
    evaluate_model(_mdl, y_test, _preds)
# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R-squared, MSE) from cv-fold cross-validation."""
    y_pred = cross_val_predict(model, x_data, y_data, cv=cv)
    r2 = r2_score(y_data, y_pred)
    mse = mean_squared_error(y_data, y_pred)
    return y_pred, r2, mse

# Cross-Validation for each model, run on the full standardized data set
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)
# Display Cross-Validation Results
cv_results = pd.DataFrame({
    'Model': ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
              'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    'R-Squared': [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2,
                  svr_cv_r2, tree_cv_r2, rf_cv_r2],
    'Mean Squared Error': [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse,
                           svr_cv_mse, tree_cv_mse, rf_cv_mse]
})
print("\nCross-Validation Results:")
print(cv_results)
# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of every column of X."""
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF for independent variables (on the raw, unscaled predictors)
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)
# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']
plt.figure(figsize=(15, 10))
for model, name in zip(models, model_names):
    plt.scatter(y_test, model.predict(X_test), label=name)
# Diagonal y = x reference line: points on it are perfect predictions
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on Original Data")
plt.legend()
plt.show()
# Bar charts of per-feature coefficients / importances. The seven copy-pasted
# plotting sections collapse into one helper; every title string is unchanged.
def _plot_feature_bars(values, xlabel, title):
    """Render a horizontal bar chart of per-feature values indexed by X's columns."""
    plt.figure(figsize=(15, 10))
    pd.Series(values, index=X.columns).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.ylabel("Feature Variables")
    plt.title(title)
    plt.show()

_plot_feature_bars(linear_model.coef_, "Coefficient Value", "Original Data Coefficients - Linear Regression")
_plot_feature_bars(lasso_model.coef_, "Coefficient Value", "Original Data Coefficients - Lasso Regression")
_plot_feature_bars(ridge_model.coef_, "Coefficient Value", "Original Data Coefficients - Ridge Regression")
_plot_feature_bars(elastic_net_model.coef_, "Coefficient Value", "Original Data Coefficients - Elastic Net Regression")
# Support Vector Regression does not provide coefficients in the same way as linear models
_plot_feature_bars(tree_model.feature_importances_, "Feature Importance", "Original Data Feature Importance - Decision Tree Regression")
_plot_feature_bars(rf_model.feature_importances_, "Feature Importance", "Original Data Feature Importance - Random Forest Regression")
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.721e+07, tolerance: 8.868e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.782e+05, tolerance: 8.868e+04 model = cd_fast.enet_coordinate_descent(
Linear Regression: Mean Squared Error: 223331.15782518263 R-squared: 0.7964451599002667 Lasso Regression: Mean Squared Error: 222595.76858534582 R-squared: 0.7971154292911737 Ridge Regression: Mean Squared Error: 221551.15979959743 R-squared: 0.7980675363613221 Elastic Net Regression: Mean Squared Error: 220207.85858986876 R-squared: 0.7992918861816286 Support Vector Regression: Mean Squared Error: 374004.55071556923 R-squared: 0.6591141278322075 Decision Tree Regression: Mean Squared Error: 209987.18842077037 R-squared: 0.808607500277945 Random Forest Regression: Mean Squared Error: 106268.82049654558 R-squared: 0.903141447102987
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.305e+07, tolerance: 9.828e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.564e+07, tolerance: 9.972e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.008e+08, tolerance: 1.064e+05 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.025e+07, tolerance: 9.586e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.695e+07, tolerance: 1.100e+05 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. 
You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.152e+05, tolerance: 1.064e+05 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.086e+05, tolerance: 1.100e+05 model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
Model R-Squared Mean Squared Error
0 Linear Regression 0.326579 710199.371805
1 Lasso Regression 0.348073 687531.375665
2 Ridge Regression 0.393373 639757.531989
3 Elastic Net Regression 0.492821 534878.717769
4 Support Vector Regression 0.524563 501402.773271
5 Decision Tree Regression 0.594728 427405.415835
6 Random Forest Regression 0.793910 217345.728136
VIF Results:
Variable VIF
0 Psum 12.146660
1 Temp 42.639319
2 pp 84.841281
3 gdp 247.855007
4 gova 18.969250
5 aa 10.041142
6 gp 9.582982
7 ls 1.623550
8 fai 36.072238
9 lgr 45.481630
10 iofp 2.607295
11 loh 6.893733
12 rpop 19.704394
13 hmp 36.194699
14 hpb 33.042339
15 mht 58.641005
16 pte 14.939183
17 tcrv 120.437473
18 pcurei 1.962042
19 pio 32.749696
20 sio 33.612325
21 tio 60.228057
22 croppg 3.471574
23 forest 2.261841
24 grass 20.862482
25 sand 2.162086
26 shrub 1.526806
27 urban 1.715978
28 water 2.176167
29 wetland 1.823453
30 bus_p 20.604707
31 dustgdp 1285.329290
32 fosgdp 3.716668
33 inter_p 31.507092
34 invpa_p 14.234124
35 mobile_p 10.779286
36 noxgdp 4.492853
37 road_p 32.125470
38 sdxgdp 1282.351539
39 taxiv 33.683888
40 telev 8.832584
41 wateru 8.771482
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\EMD_Residual.csv"
df = pd.read_csv(data_path)
# 'EVI' (Enhanced Vegetation Index) is the dependent variable; all other columns are predictors
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable
# Replace infinities with NaN BEFORE imputation so the mean-imputer fills them.
# The original np.nan_to_num(..., nan=np.nan, posinf=np.nan, neginf=np.nan) was
# a no-op for NaNs and reintroduced NaNs for infinities AFTER imputation,
# defeating the imputer entirely.
X = X.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# Handle non-finite values in y the same way (mean-fill)
y = y.replace([np.inf, -np.inf], np.nan)
y = y.fillna(y.mean())
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)
# Fit all seven regressors on the standardized training split.
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
# Lasso Regression -- max_iter raised from the default 1000: coordinate descent
# emitted ConvergenceWarning ("Objective did not converge") on this data.
lasso_model = Lasso(alpha=0.01, max_iter=10000)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
# Elastic Net Regression -- same convergence fix as Lasso.
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)
# Support Vector Regression (linear kernel)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
# Decision Tree Regression (unseeded: results vary between runs)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
# Random Forest Regression (unseeded: results vary between runs)
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print the mean squared error and R-squared of a model's predictions.

    `model` is accepted for interface compatibility but is not used; the
    metrics are computed directly from `y_test` vs `y_pred`.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

# Report hold-out metrics for every fitted model (same output as seven
# copy-pasted print/evaluate pairs, collapsed into one loop).
for _label, _mdl, _preds in [
    ('Linear Regression', linear_model, y_pred_linear),
    ('Lasso Regression', lasso_model, y_pred_lasso),
    ('Ridge Regression', ridge_model, y_pred_ridge),
    ('Elastic Net Regression', elastic_net_model, y_pred_elastic_net),
    ('Support Vector Regression', svr_model, y_pred_svr),
    ('Decision Tree Regression', tree_model, y_pred_tree),
    ('Random Forest Regression', rf_model, y_pred_rf),
]:
    print(f"\n{_label}:")
    evaluate_model(_mdl, y_test, _preds)
# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R-squared, MSE) from cv-fold cross-validation."""
    y_pred = cross_val_predict(model, x_data, y_data, cv=cv)
    r2 = r2_score(y_data, y_pred)
    mse = mean_squared_error(y_data, y_pred)
    return y_pred, r2, mse

# Cross-Validation for each model, run on the full standardized data set
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)
# Display Cross-Validation Results
cv_results = pd.DataFrame({
    'Model': ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
              'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    'R-Squared': [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2,
                  svr_cv_r2, tree_cv_r2, rf_cv_r2],
    'Mean Squared Error': [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse,
                           svr_cv_mse, tree_cv_mse, rf_cv_mse]
})
print("\nCross-Validation Results:")
print(cv_results)
# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of every column of X."""
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF for independent variables (on the raw, unscaled predictors)
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)
# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']
plt.figure(figsize=(15, 10))
for model, name in zip(models, model_names):
    plt.scatter(y_test, model.predict(X_test), label=name)
# Diagonal y = x reference line: points on it are perfect predictions
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on EMD_Residual Data")
plt.legend()
plt.show()
# Bar charts of per-feature coefficients / importances. The seven copy-pasted
# plotting sections collapse into one helper; every title string is unchanged.
def _plot_feature_bars(values, xlabel, title):
    """Render a horizontal bar chart of per-feature values indexed by X's columns."""
    plt.figure(figsize=(15, 10))
    pd.Series(values, index=X.columns).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.ylabel("Feature Variables")
    plt.title(title)
    plt.show()

_plot_feature_bars(linear_model.coef_, "Coefficient Value", "EMD_Residual Data Coefficients - Linear Regression")
_plot_feature_bars(lasso_model.coef_, "Coefficient Value", "EMD_Residual Data Coefficients - Lasso Regression")
_plot_feature_bars(ridge_model.coef_, "Coefficient Value", "EMD_Residual Data Coefficients - Ridge Regression")
_plot_feature_bars(elastic_net_model.coef_, "Coefficient Value", "EMD_Residual Data Coefficients - Elastic Net Regression")
# Support Vector Regression does not provide coefficients in the same way as linear models
_plot_feature_bars(tree_model.feature_importances_, "Feature Importance", "EMD_Residual Data Feature Importance - Decision Tree Regression")
_plot_feature_bars(rf_model.feature_importances_, "Feature Importance", "EMD_Residual Data Feature Importance - Random Forest Regression")
Linear Regression: Mean Squared Error: 161589.5170867599 R-squared: 0.8306942125812589 Lasso Regression: Mean Squared Error: 161593.50097144092 R-squared: 0.8306900384569402 Ridge Regression: Mean Squared Error: 162331.6835140603 R-squared: 0.8299166060035847 Elastic Net Regression: Mean Squared Error: 165031.10461440653 R-squared: 0.8270882813498048 Support Vector Regression: Mean Squared Error: 298695.61362412694 R-squared: 0.6870409852391317 Decision Tree Regression: Mean Squared Error: 3240.126584973943 R-squared: 0.9966051499336379 Random Forest Regression: Mean Squared Error: 4885.857836475552 R-squared: 0.9948808312374844
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.346e+05, tolerance: 8.838e+04 model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
Model R-Squared Mean Squared Error
0 Linear Regression 0.001368 942591.458133
1 Lasso Regression 0.002814 941225.938013
2 Ridge Regression 0.024348 920900.635013
3 Elastic Net Regression 0.059235 887971.312044
4 Support Vector Regression 0.325167 636963.229115
5 Decision Tree Regression 0.362003 602193.925694
6 Random Forest Regression 0.710667 273095.921391
VIF Results:
Variable VIF
0 Psum 46.201812
1 Temp 78.081874
2 pp 14.896199
3 gdp 15.015930
4 gova 5.786582
5 aa 5.477537
6 gp 2.874869
7 ls 8.281318
8 fai 9.254266
9 lgr 11.599613
10 iofp 6.333291
11 loh 8.323424
12 rpop 9.204349
13 hmp 11.487775
14 hpb 50.590835
15 mht 61.867141
16 pte 11.457694
17 tcrv 89.898989
18 pcurei 2.675602
19 pio 11.322218
20 sio 13.748914
21 tio 70.182718
22 croppg 3.269623
23 forest 2.350379
24 grass 23.945826
25 sand 2.530238
26 shrub 1.951590
27 urban 1.312915
28 water 2.006455
29 wetland 1.220154
30 bus_p 15.760790
31 dustgdp 29.858260
32 fosgdp 4.169074
33 inter_p 40.202206
34 invpa_p 16.281991
35 mobile_p 7.781903
36 noxgdp 16.342599
37 road_p 42.365681
38 sdxgdp 47.148103
39 taxiv 82.841972
40 telev 17.652349
41 wateru 10.769069
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\SVR_Res.csv"
df = pd.read_csv(data_path)
# 'EVI' (Enhanced Vegetation Index) is the dependent variable; all other columns are predictors
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable
# Replace infinities with NaN BEFORE imputation so the mean-imputer fills them.
# The original np.nan_to_num(..., nan=np.nan, posinf=np.nan, neginf=np.nan) was
# a no-op for NaNs and reintroduced NaNs for infinities AFTER imputation,
# defeating the imputer entirely.
X = X.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# Handle non-finite values in y the same way (mean-fill)
y = y.replace([np.inf, -np.inf], np.nan)
y = y.fillna(y.mean())
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)
# Fit all seven regressors on the standardized training split.
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
# Lasso Regression -- max_iter raised from the default 1000: coordinate descent
# emitted ConvergenceWarning ("Objective did not converge") on this data.
lasso_model = Lasso(alpha=0.01, max_iter=10000)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
# Elastic Net Regression -- same convergence fix as Lasso.
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)
# Support Vector Regression (linear kernel)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
# Decision Tree Regression (unseeded: results vary between runs)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
# Random Forest Regression (unseeded: results vary between runs)
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print the mean squared error and R-squared of a model's predictions.

    `model` is accepted for interface compatibility but is not used; the
    metrics are computed directly from `y_test` vs `y_pred`.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

# Report hold-out metrics for every fitted model (same output as seven
# copy-pasted print/evaluate pairs, collapsed into one loop).
for _label, _mdl, _preds in [
    ('Linear Regression', linear_model, y_pred_linear),
    ('Lasso Regression', lasso_model, y_pred_lasso),
    ('Ridge Regression', ridge_model, y_pred_ridge),
    ('Elastic Net Regression', elastic_net_model, y_pred_elastic_net),
    ('Support Vector Regression', svr_model, y_pred_svr),
    ('Decision Tree Regression', tree_model, y_pred_tree),
    ('Random Forest Regression', rf_model, y_pred_rf),
]:
    print(f"\n{_label}:")
    evaluate_model(_mdl, y_test, _preds)
# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R-squared, MSE) from cv-fold cross-validation."""
    y_pred = cross_val_predict(model, x_data, y_data, cv=cv)
    r2 = r2_score(y_data, y_pred)
    mse = mean_squared_error(y_data, y_pred)
    return y_pred, r2, mse

# Cross-Validation for each model, run on the full standardized data set
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)
# Display Cross-Validation Results
cv_results = pd.DataFrame({
    'Model': ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
              'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    'R-Squared': [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2,
                  svr_cv_r2, tree_cv_r2, rf_cv_r2],
    'Mean Squared Error': [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse,
                           svr_cv_mse, tree_cv_mse, rf_cv_mse]
})
print("\nCross-Validation Results:")
print(cv_results)
# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of every column of X."""
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF for independent variables (on the raw, unscaled predictors)
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)
# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']
plt.figure(figsize=(15, 10))
for model, name in zip(models, model_names):
    plt.scatter(y_test, model.predict(X_test), label=name)
# Diagonal y = x reference line: points on it are perfect predictions
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on SVR_Residual Data")
plt.legend()
plt.show()
# Bar charts of per-feature coefficients / importances. The seven copy-pasted
# plotting sections collapse into one helper; every title string is unchanged.
def _plot_feature_bars(values, xlabel, title):
    """Render a horizontal bar chart of per-feature values indexed by X's columns."""
    plt.figure(figsize=(15, 10))
    pd.Series(values, index=X.columns).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.ylabel("Feature Variables")
    plt.title(title)
    plt.show()

_plot_feature_bars(linear_model.coef_, "Coefficient Value", "SVR_Residual Data Coefficients - Linear Regression")
_plot_feature_bars(lasso_model.coef_, "Coefficient Value", "SVR_Residual Data Coefficients - Lasso Regression")
_plot_feature_bars(ridge_model.coef_, "Coefficient Value", "SVR_Residual Data Coefficients - Ridge Regression")
_plot_feature_bars(elastic_net_model.coef_, "Coefficient Value", "SVR_Residual Data Coefficients - Elastic Net Regression")
# Support Vector Regression does not provide coefficients in the same way as linear models
_plot_feature_bars(tree_model.feature_importances_, "Feature Importance", "SVR_Residual Data Feature Importance - Decision Tree Regression")
_plot_feature_bars(rf_model.feature_importances_, "Feature Importance", "SVR_Residual Data Feature Importance - Random Forest Regression")
Linear Regression:
Mean Squared Error: 191029.2347223954
R-squared: 0.799848680856509
Lasso Regression:
Mean Squared Error: 191034.26939142178
R-squared: 0.7998434057704806
Ridge Regression:
Mean Squared Error: 191142.80414797275
R-squared: 0.7997296882301881
Elastic Net Regression:
Mean Squared Error: 191608.8948863569
R-squared: 0.7992413405892435
Support Vector Regression:
Mean Squared Error: 282410.0030080256
R-squared: 0.7041042711419767
Decision Tree Regression:
Mean Squared Error: 2026.0063391507622
R-squared: 0.9978772472079293
Random Forest Regression:
Mean Squared Error: 1481.713775843146
R-squared: 0.9984475309904316
Cross-Validation Results:
Model R-Squared Mean Squared Error
0 Linear Regression -1.342138e+15 1.266821e+21
1 Lasso Regression -1.638980e+01 1.641392e+07
2 Ridge Regression -7.271596e+00 7.807413e+06
3 Elastic Net Regression -2.039070e+00 2.868524e+06
4 Support Vector Regression 3.534777e-01 6.102409e+05
5 Decision Tree Regression -3.573694e-01 1.281197e+06
6 Random Forest Regression 2.011485e-01 7.540217e+05
C:\Users\Owner\anaconda3\envs\arcpro39\lib\site-packages\statsmodels\regression\linear_model.py:1783: RuntimeWarning: invalid value encountered in scalar divide return 1 - self.ssr/self.uncentered_tss
VIF Results:
Variable VIF
0 Psum 60.604263
1 Temp 114.843237
2 pp 19.194119
3 gdp 11.391848
4 gova 13.290764
5 aa 5.691484
6 gp 2.829718
7 ls 9.252364
8 fai 13.141497
9 lgr 14.539192
10 iofp 25.967739
11 loh 17.505314
12 rpop 11.057719
13 hmp 13.630785
14 hpb 6.587827
15 mht 31.696345
16 pte 26.365839
17 tcrv NaN
18 pcurei 4.561716
19 pio 29.720630
20 sio 9.738110
21 tio NaN
22 croppg 2.516127
23 forest 2.113349
24 grass 18.374692
25 sand 3.403265
26 shrub 2.102264
27 urban 6.026541
28 water 2.911562
29 wetland 2.310738
30 bus_p 10.432935
31 dustgdp 22.514362
32 fosgdp 11.218869
33 inter_p 4.670754
34 invpa_p 13.128986
35 mobile_p 6.275998
36 noxgdp 14.991900
37 road_p 38.888496
38 sdxgdp 28.007735
39 taxiv 3.264132
40 telev 14.209921
41 wateru 14.883773
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\SVR_Agg.csv"
df = pd.read_csv(data_path)
df.head()
# 'EVI' is the dependent variable; every other column is a predictor
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable
# Convert +/-inf to NaN *before* imputation so the imputer fills them with
# column means. The previous nan_to_num(..., nan=np.nan, posinf=np.nan)
# call was a no-op for NaN and re-introduced NaNs AFTER imputation, which
# StandardScaler and the estimators cannot handle.
X = X.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# The target cannot be meaningfully imputed: drop rows whose EVI is NaN/inf
y = y.replace([np.inf, -np.inf], np.nan)
_valid = y.notna().to_numpy()
X_imputed = X_imputed[_valid]
y = y[_valid].to_numpy()
# Standardize the features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)
def _fit_predict(estimator):
    """Fit *estimator* on the training split and return its test-split predictions."""
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)

# Linear Regression
linear_model = LinearRegression()
y_pred_linear = _fit_predict(linear_model)
# Lasso Regression
lasso_model = Lasso(alpha=0.01)
y_pred_lasso = _fit_predict(lasso_model)
# Ridge Regression
ridge_model = Ridge(alpha=1.0)
y_pred_ridge = _fit_predict(ridge_model)
# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
y_pred_elastic_net = _fit_predict(elastic_net_model)
# Support Vector Regression
svr_model = SVR(kernel='linear')
y_pred_svr = _fit_predict(svr_model)
# Decision Tree Regression
tree_model = DecisionTreeRegressor()
y_pred_tree = _fit_predict(tree_model)
# Random Forest Regression
rf_model = RandomForestRegressor()
y_pred_rf = _fit_predict(rf_model)
# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print and return test-set MSE and R-squared for a fitted model.

    Parameters
    ----------
    model : estimator
        Unused; kept for backward compatibility with existing call sites.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned with *y_test*.

    Returns
    -------
    tuple of (float, float)
        ``(mse, r2)`` so callers can collect metrics instead of parsing stdout.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2
# Print test-set metrics for every fitted model, in the order they were fit.
for _label, _mdl, _preds in [
    ('Linear Regression', linear_model, y_pred_linear),
    ('Lasso Regression', lasso_model, y_pred_lasso),
    ('Ridge Regression', ridge_model, y_pred_ridge),
    ('Elastic Net Regression', elastic_net_model, y_pred_elastic_net),
    ('Support Vector Regression', svr_model, y_pred_svr),
    ('Decision Tree Regression', tree_model, y_pred_tree),
    ('Random Forest Regression', rf_model, y_pred_rf),
]:
    print(f"\n{_label}:")
    evaluate_model(_mdl, y_test, _preds)
# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return out-of-fold predictions plus pooled R-squared and MSE for *model*.

    Uses ``cross_val_predict`` so every sample is scored exactly once by a
    fold that did not train on it.
    """
    preds = cross_val_predict(model, x_data, y_data, cv=cv)
    return preds, r2_score(y_data, preds), mean_squared_error(y_data, preds)
# Cross-validate every model on the full standardized matrix (5-fold).
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)
# Tabulate the pooled out-of-fold scores, one row per model.
cv_results = pd.DataFrame(
    [
        ('Linear Regression', linear_cv_r2, linear_cv_mse),
        ('Lasso Regression', lasso_cv_r2, lasso_cv_mse),
        ('Ridge Regression', ridge_cv_r2, ridge_cv_mse),
        ('Elastic Net Regression', elastic_net_cv_r2, elastic_net_cv_mse),
        ('Support Vector Regression', svr_cv_r2, svr_cv_mse),
        ('Decision Tree Regression', tree_cv_r2, tree_cv_mse),
        ('Random Forest Regression', rf_cv_r2, rf_cv_mse),
    ],
    columns=['Model', 'R-Squared', 'Mean Squared Error'],
)
print("\nCross-Validation Results:")
print(cv_results)
# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Compute the variance inflation factor (VIF) for every column of *X*.

    Rows containing NaN or +/-inf are dropped first. Feeding them straight to
    statsmodels (as the original version did) makes the affected columns'
    VIFs come out as NaN and triggers a RuntimeWarning.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric predictor matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable`` and ``VIF``, one row per predictor.
    """
    clean = X.replace([np.inf, -np.inf], np.nan).dropna()
    vif_data = pd.DataFrame()
    vif_data["Variable"] = clean.columns
    vif_data["VIF"] = [variance_inflation_factor(clean.values, i) for i in range(clean.shape[1])]
    return vif_data
# Calculate VIF for independent variables.
# Drop NaN/inf rows first: the raw X may contain missing values that would
# otherwise make every affected VIF come out as NaN.
vif_results = calculate_vif(X.replace([np.inf, -np.inf], np.nan).dropna())
print("\nVIF Results:")
print(vif_results)
# Scatter each model's test-set predictions against the actual EVI values.
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']
plt.figure(figsize=(15, 10))
for model, name in zip(models, model_names):
    plt.scatter(y_test, model.predict(X_test), label=name)
# Diagonal reference line: a perfect model would fall exactly on it.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on SVR_Aggregate Data")
plt.legend()
plt.show()

def _plot_feature_bars(values, xlabel, title):
    """Horizontal bar chart of per-feature *values*, indexed by X's column names."""
    plt.figure(figsize=(15, 10))
    pd.Series(values, index=X.columns).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.ylabel("Feature Variables")
    plt.title(title)
    plt.show()

# Coefficients for the linear-family models
_plot_feature_bars(linear_model.coef_, "Coefficient Value", "SVR_Aggregate Data Coefficients - Linear Regression")
_plot_feature_bars(lasso_model.coef_, "Coefficient Value", "SVR_Aggregate Data Coefficients - Lasso Regression")
_plot_feature_bars(ridge_model.coef_, "Coefficient Value", "SVR_Aggregate Data Coefficients - Ridge Regression")
_plot_feature_bars(elastic_net_model.coef_, "Coefficient Value", "SVR_Aggregate Data Coefficients - Elastic Net Regression")
# Support Vector Regression does not provide coefficients in the same way as linear models
# Impurity-based feature importances for the tree-based models
_plot_feature_bars(tree_model.feature_importances_, "Feature Importance", "SVR_Aggregate Data Feature Importance - Decision Tree Regression")
_plot_feature_bars(rf_model.feature_importances_, "Feature Importance", "SVR_Aggregate Data Feature Importance - Random Forest Regression")
Linear Regression: Mean Squared Error: 248917.07236289946 R-squared: 0.7678656394878618 Lasso Regression: Mean Squared Error: 248933.91214058246 R-squared: 0.7678499350968913 Ridge Regression: Mean Squared Error: 248714.65693129468 R-squared: 0.7680544074832721 Elastic Net Regression: Mean Squared Error: 248460.46874279427 R-squared: 0.768291457565965 Support Vector Regression: Mean Squared Error: 368410.78910387954 R-squared: 0.6564285361282114 Decision Tree Regression: Mean Squared Error: 120936.7605888516 R-squared: 0.8872171469991669 Random Forest Regression: Mean Squared Error: 69221.83308766509 R-squared: 0.9354453039128781
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.909e+05, tolerance: 9.482e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.861e+05, tolerance: 9.569e+04 model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
Model R-Squared Mean Squared Error
0 Linear Regression -1.406768 2.456676e+06
1 Lasso Regression -1.387017 2.436516e+06
2 Ridge Regression -0.551109 1.583274e+06
3 Elastic Net Regression 0.086989 9.319438e+05
4 Support Vector Regression 0.444086 5.674422e+05
5 Decision Tree Regression 0.543226 4.662462e+05
6 Random Forest Regression 0.697915 3.083493e+05
VIF Results:
Variable VIF
0 Psum 21.349290
1 Temp 64.008951
2 pp 102.315860
3 gdp 24.284434
4 gova 21.969581
5 aa 11.697837
6 gp 7.883940
7 ls 2.598890
8 fai 12.097485
9 lgr 21.400360
10 iofp 3.015758
11 loh 6.784207
12 rpop 23.224089
13 hmp 42.382273
14 hpb 36.258598
15 mht 77.773037
16 pte 21.528149
17 tcrv 58.645647
18 pcurei 2.776087
19 pio 33.625959
20 sio 27.232431
21 tio 51.229273
22 croppg 2.817496
23 forest 1.698973
24 grass 26.368864
25 sand 2.378605
26 shrub 1.474054
27 urban 1.186559
28 water 1.302964
29 wetland 1.274774
30 bus_p 14.514227
31 dustgdp 1.483317
32 fosgdp 4.335904
33 inter_p 14.804122
34 invpa_p 13.934623
35 mobile_p 4.929697
36 noxgdp 16.101705
37 road_p 34.033373
38 sdxgdp 3.073364
39 taxiv 65.064443
40 telev 9.360795
41 wateru 14.420194
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\TheiSenSlopes.csv"
df = pd.read_csv(data_path)
df.head()
# 'EVI' is the dependent variable; every other column is a predictor
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable
# Convert +/-inf to NaN *before* imputation so the imputer fills them with
# column means. The previous nan_to_num(..., nan=np.nan, posinf=np.nan)
# call was a no-op for NaN and re-introduced NaNs AFTER imputation, which
# StandardScaler and the estimators cannot handle.
X = X.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# The target cannot be meaningfully imputed: drop rows whose EVI is NaN/inf
y = y.replace([np.inf, -np.inf], np.nan)
_valid = y.notna().to_numpy()
X_imputed = X_imputed[_valid]
y = y[_valid].to_numpy()
# Standardize the features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)
def _fit_predict(estimator):
    """Fit *estimator* on the training split and return its test-split predictions."""
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)

# Linear Regression
linear_model = LinearRegression()
y_pred_linear = _fit_predict(linear_model)
# Lasso Regression
lasso_model = Lasso(alpha=0.01)
y_pred_lasso = _fit_predict(lasso_model)
# Ridge Regression
ridge_model = Ridge(alpha=1.0)
y_pred_ridge = _fit_predict(ridge_model)
# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
y_pred_elastic_net = _fit_predict(elastic_net_model)
# Support Vector Regression
svr_model = SVR(kernel='linear')
y_pred_svr = _fit_predict(svr_model)
# Decision Tree Regression
tree_model = DecisionTreeRegressor()
y_pred_tree = _fit_predict(tree_model)
# Random Forest Regression
rf_model = RandomForestRegressor()
y_pred_rf = _fit_predict(rf_model)
# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print and return test-set MSE and R-squared for a fitted model.

    Parameters
    ----------
    model : estimator
        Unused; kept for backward compatibility with existing call sites.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned with *y_test*.

    Returns
    -------
    tuple of (float, float)
        ``(mse, r2)`` so callers can collect metrics instead of parsing stdout.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2
# Print test-set metrics for every fitted model, in the order they were fit.
for _label, _mdl, _preds in [
    ('Linear Regression', linear_model, y_pred_linear),
    ('Lasso Regression', lasso_model, y_pred_lasso),
    ('Ridge Regression', ridge_model, y_pred_ridge),
    ('Elastic Net Regression', elastic_net_model, y_pred_elastic_net),
    ('Support Vector Regression', svr_model, y_pred_svr),
    ('Decision Tree Regression', tree_model, y_pred_tree),
    ('Random Forest Regression', rf_model, y_pred_rf),
]:
    print(f"\n{_label}:")
    evaluate_model(_mdl, y_test, _preds)
# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return out-of-fold predictions plus pooled R-squared and MSE for *model*.

    Uses ``cross_val_predict`` so every sample is scored exactly once by a
    fold that did not train on it.
    """
    preds = cross_val_predict(model, x_data, y_data, cv=cv)
    return preds, r2_score(y_data, preds), mean_squared_error(y_data, preds)
# Cross-validate every model on the full standardized matrix (5-fold).
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)
# Tabulate the pooled out-of-fold scores, one row per model.
cv_results = pd.DataFrame(
    [
        ('Linear Regression', linear_cv_r2, linear_cv_mse),
        ('Lasso Regression', lasso_cv_r2, lasso_cv_mse),
        ('Ridge Regression', ridge_cv_r2, ridge_cv_mse),
        ('Elastic Net Regression', elastic_net_cv_r2, elastic_net_cv_mse),
        ('Support Vector Regression', svr_cv_r2, svr_cv_mse),
        ('Decision Tree Regression', tree_cv_r2, tree_cv_mse),
        ('Random Forest Regression', rf_cv_r2, rf_cv_mse),
    ],
    columns=['Model', 'R-Squared', 'Mean Squared Error'],
)
print("\nCross-Validation Results:")
print(cv_results)
# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Compute the variance inflation factor (VIF) for every column of *X*.

    Rows containing NaN or +/-inf are dropped first. Feeding them straight to
    statsmodels (as the original version did) makes the affected columns'
    VIFs come out as NaN and triggers a RuntimeWarning.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric predictor matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable`` and ``VIF``, one row per predictor.
    """
    clean = X.replace([np.inf, -np.inf], np.nan).dropna()
    vif_data = pd.DataFrame()
    vif_data["Variable"] = clean.columns
    vif_data["VIF"] = [variance_inflation_factor(clean.values, i) for i in range(clean.shape[1])]
    return vif_data
# Calculate VIF for independent variables.
# Drop NaN/inf rows first: the raw X may contain missing values that would
# otherwise make every affected VIF come out as NaN.
vif_results = calculate_vif(X.replace([np.inf, -np.inf], np.nan).dropna())
print("\nVIF Results:")
print(vif_results)
# Scatter each model's test-set predictions against the actual EVI values.
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']
plt.figure(figsize=(15, 10))
for model, name in zip(models, model_names):
    plt.scatter(y_test, model.predict(X_test), label=name)
# Diagonal reference line: a perfect model would fall exactly on it.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on TheiSenSlopes Data")
plt.legend()
plt.show()

def _plot_feature_bars(values, xlabel, title):
    """Horizontal bar chart of per-feature *values*, indexed by X's column names."""
    plt.figure(figsize=(15, 10))
    pd.Series(values, index=X.columns).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.ylabel("Feature Variables")
    plt.title(title)
    plt.show()

# Coefficients for the linear-family models
_plot_feature_bars(linear_model.coef_, "Coefficient Value", "TheiSenSlopes Data Coefficients - Linear Regression")
_plot_feature_bars(lasso_model.coef_, "Coefficient Value", "TheiSenSlopes Data Coefficients - Lasso Regression")
_plot_feature_bars(ridge_model.coef_, "Coefficient Value", "TheiSenSlopes Data Coefficients - Ridge Regression")
_plot_feature_bars(elastic_net_model.coef_, "Coefficient Value", "TheiSenSlopes Data Coefficients - Elastic Net Regression")
# Support Vector Regression does not provide coefficients in the same way as linear models
# Impurity-based feature importances for the tree-based models
_plot_feature_bars(tree_model.feature_importances_, "Feature Importance", "TheiSenSlopes Data Feature Importance - Decision Tree Regression")
_plot_feature_bars(rf_model.feature_importances_, "Feature Importance", "TheiSenSlopes Data Feature Importance - Random Forest Regression")
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.813e+07, tolerance: 8.208e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.089e+07, tolerance: 8.208e+04 model = cd_fast.enet_coordinate_descent(
Linear Regression: Mean Squared Error: 85477.52029500603 R-squared: 0.912689865591831 Lasso Regression: Mean Squared Error: 86436.38893434322 R-squared: 0.9117104390772178 Ridge Regression: Mean Squared Error: 84944.32132575687 R-squared: 0.9132344961977583 Elastic Net Regression: Mean Squared Error: 87490.01617333169 R-squared: 0.9106342223650963 Support Vector Regression: Mean Squared Error: 211958.5823537373 R-squared: 0.7834970849598806 Decision Tree Regression: Mean Squared Error: 7503.955761272335 R-squared: 0.9923351615272826 Random Forest Regression: Mean Squared Error: 16989.891045389293 R-squared: 0.9826458504454328
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.897e+07, tolerance: 8.777e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.418e+07, tolerance: 8.954e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.802e+07, tolerance: 9.770e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.694e+07, tolerance: 8.864e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.989e+07, tolerance: 1.028e+05 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. 
You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.684e+07, tolerance: 8.777e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.086e+07, tolerance: 8.954e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.848e+07, tolerance: 9.770e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.365e+07, tolerance: 8.864e+04 model = cd_fast.enet_coordinate_descent( C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.738e+07, tolerance: 1.028e+05 model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
Model R-Squared Mean Squared Error
0 Linear Regression -2.559135 3.434514e+06
1 Lasso Regression -1.955554 2.852066e+06
2 Ridge Regression -0.508402 1.455585e+06
3 Elastic Net Regression 0.154341 8.160482e+05
4 Support Vector Regression 0.586065 3.994415e+05
5 Decision Tree Regression 0.577707 4.075065e+05
6 Random Forest Regression 0.752843 2.385031e+05
VIF Results:
Variable VIF
0 Psum 95.325265
1 Temp 138.755380
2 pp 313.749531
3 gdp 538.821338
4 gova 354.097648
5 aa 16.568545
6 gp 15.482781
7 ls 14.244668
8 fai 185.895653
9 lgr 199.701678
10 iofp 4.230339
11 loh 10.225547
12 rpop 71.771812
13 hmp 64.460563
14 hpb 193.395111
15 mht 164.132824
16 pte 53.759338
17 tcrv 446.530381
18 pcurei 88.105709
19 pio 343.920908
20 sio 109.362204
21 tio 205.115146
22 croppg 7.110237
23 forest 15.110842
24 grass 48.239407
25 sand 3.094659
26 shrub 12.424985
27 urban 3.272255
28 water 2.649976
29 wetland 1.809359
30 bus_p 403.303588
31 dustgdp 2970.874935
32 fosgdp 17.101441
33 inter_p 167.704628
34 invpa_p 39.852756
35 mobile_p 288.768518
36 noxgdp 532.880801
37 road_p 133.143666
38 sdxgdp 2080.111808
39 taxiv 256.607063
40 telev 340.640595
41 wateru 89.470541